Find out how many UP and DOWN genes are also present in the
consensus signature
upInDf <-dff[dff$gene %in% up_genes$gene,]
setdiff(up_genes$gene,dff$gene)
## [1] "ACKR2" "R3HDML"
upInDf <-upInDf[order(upInDf$logFC, decreasing = TRUE),]
upInDf_sg <-upInDf[upInDf$`pv-adj` <= 0.05,]
dwInDf <-dff[dff$gene %in% dw_genes$gene,]
setdiff(dw_genes$gene,dff$gene)
## character(0)
dwInDf <-dwInDf[order(dwInDf$logFC, decreasing = TRUE),]
dwInDf_sg <-dwInDf[dwInDf$`pv-adj` <= 0.05,]
Venn Plot of Consensus genes and DE Validation genes
up_v <-dff[dff$logFC >=1.5 & dff$`pv-adj` <= 0.05,]
up_v <-up_v[complete.cases(up_v),] #315
dw_v <-dff[dff$logFC <= -1.5 & dff$`pv-adj` <= 0.05,]
dw_v <-dw_v[complete.cases(dw_v),] #320
DE_Validation <- c(up_v$gene,dw_v$gene)
consensus_sig <- c(up_genes$gene,dw_genes$gene)
length(intersect(dw_v$gene,dw_genes$gene))
## [1] 3
length(intersect(up_v$gene,up_genes$gene))
## [1] 28
dff2 <-dff[dff$gene %in% DE_Validation,]
dff2 <-dff2[complete.cases(dff2$gene),]
v <-list(DE_Validation=DE_Validation, consensus_sig=consensus_sig)
names(v) <-c("DE genes in Validation Dataset","Consensus signature")
ggvenn(v, show_elements = F,show_percentage = T, text_size =17, set_name_size = 10,
stroke_size = 0.5, fill_color = brewer.pal(name="Set1",n=4))+
ggtitle("Consensus genes differentially expressed in Validation dataset",
subtitle = "Validation DE: log2FC threshold = |1.5|, pv_adj <= 0.05")+
theme(plot.title = element_text(hjust = 0.5,vjust = 1, size = 25),
plot.subtitle = element_text(hjust = 0.5 ,vjust = 1, size = 15))

Fisher test to assess significance for the intersection
load(paste0(MEDIAsave,"combined_full_list.RData"))
conting_table <-data.frame("sign_in_Val"=rep(NA,2),"NOTsign_in_Val"=rep(NA,2))
rownames(conting_table) <-c("sign_in_Cons","NOTsign_in_Cons")
conting_table["sign_in_Cons","sign_in_Val"] <-length(intersect(DE_Validation,consensus_sig)) #31
conting_table["sign_in_Cons","NOTsign_in_Val"] <-length(setdiff(consensus_sig,DE_Validation)) #13
comb_notSig <-combined[!combined$gene %in% consensus_sig,] #13120=13164-44
Val_notSig <-dff[!dff$gene %in% dff2$gene,] #11811=12446-635
Val_notCons <-dff2[!dff2$gene %in% consensus_sig,]
conting_table["NOTsign_in_Cons","sign_in_Val"] <-nrow(Val_notCons) #604
conting_table["NOTsign_in_Cons","NOTsign_in_Val"]<-length(intersect(comb_notSig$gene,Val_notSig$gene)) #10241
conting_table
## sign_in_Val NOTsign_in_Val
## sign_in_Cons 31 13
## NOTsign_in_Cons 604 10241
fisher.test(conting_table)
##
## Fisher's Exact Test for Count Data
##
## data: conting_table
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 20.40468 84.37084
## sample estimates:
## odds ratio
## 40.38563
rm(comb_notSig,Val_notCons,Val_notSig)
write_xlsx(dff2, path=paste0(MEDIAsave,"DEValidationSelected.xlsx"))
png(file = paste0(MEDIAgraph,"Fig5a_venn.png"), width = 14, height = 8, res=300, units = "in")
ggvenn(v, show_elements = F,show_percentage = T, text_size =14, set_name_size = 9,
stroke_size = 0.5, fill_color = brewer.pal(name="Set1",n=4))+
ggtitle("Consensus genes differentially expressed in Validation dataset",
subtitle = "Validation DE: log2FC threshold = |1.5|, pv_adj <= 0.05")+
theme(plot.title = element_text(hjust = 0.5,vjust = 1, size = 25),
plot.subtitle = element_text(hjust = 0.5 ,vjust = 1, size = 15,margin = margin(0,0,15,0)))
dev.off()
rm(dff2)
norm4dataset <- as.data.frame(read_excel("~/MEDIA Project/GSE114007_raw_counts_4dataset.xlsx", sheet=1)) #Normal samples data
rownames(norm4dataset) <-norm4dataset$symbol
norm4dataset <-norm4dataset[,-1]
dim(norm4dataset)
## [1] 23710 18
oa4dataset <- as.data.frame(read_excel("~/MEDIA Project/GSE114007_raw_counts_4dataset.xlsx", sheet=2)) #OA samples data
rownames(oa4dataset) <-oa4dataset$symbol
oa4dataset <-oa4dataset[,-1]
dim(oa4dataset)
## [1] 23710 20
norm4dataset <-norm4dataset[rownames(oa4dataset),]
newdataset <-cbind(norm4dataset,oa4dataset)
coldata <-data.frame(class=c(rep("N",18),rep("OA",20)))
coldata$class <-as.factor(coldata$class)
coldata$info <-colnames(newdataset)
coldata$condition <-"cart"
coldata[9:18,"condition"] <-"all"
coldata[19:28,"condition"] <-"cart"
coldata[29:38,"condition"] <-"all"
coldata$condition <-as.factor(coldata$condition)
rownames(coldata) <-coldata$info
newdataset <- newdataset[, rownames(coldata)]
save(newdataset, file=paste0(MEDIAsave,"datasetValidation_full.RData"))
dds <- DESeqDataSetFromMatrix(countData = newdataset,
colData = coldata,
design = ~condition + class)
keep <- rowSums(assay(dds)>=5) >= 19 #at least 50% of the patients
dds <- dds[keep,]
dds$class <- factor(dds$class, levels = c("N","OA"))
dds$class <- relevel(dds$class, ref = "N")
dds$condition <- factor(dds$condition, levels = c("all","cart"))
nrow(dds)
## [1] 14643
Batch correction
mm <- model.matrix(~class, colData(vsd))
assay(vsd)<-removeBatchEffect(assay(vsd), batch = vsd$condition,design=mm)
pcaData <- plotPCA(vsd, intgroup=c("class", "condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=class, shape=condition,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+ggtitle("All samples without batch")+ theme_bw()

HEATMAP of consensus signature on the Validation dataset (z-score),
batch corrected
normdata <-assay(vsd)
normdata1 <-t(apply(normdata,1,scale))
colnames(normdata1) <-colnames(normdata)
normdata <-normdata1
signature <-c(up_genes$gene,dw_genes$gene)
setdiff(signature,rownames(normdata)) #""ACKR2" "TMEM59L" "R3HDML" UP gene missing
## [1] "ACKR2" "TMEM59L" "R3HDML"
up2 <-up_genes$gene[up_genes$gene %in% rownames(normdata)]
normdata <-normdata[c(up2,dw_genes$gene),]
dim(normdata)
## [1] 41 38
coldataAnn <-coldata
coldataAnn$col2 <-"plum"
coldataAnn[coldataAnn$class=="OA","col2"] <-"cyan"
rowdata <-data.frame(genes=c(up2,dw_genes$gene))
rowdata$col3 <-c(rep("red",36),rep("blue",5))
heatmap3(normdata[rowdata$genes,rownames(coldataAnn)],
scale="none",
col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
RowAxisColors = 1,
balanceColor = TRUE,
margins = c(16,10),
ColSideColors = coldataAnn$col2,
ColSideLabs = "Class",
RowSideColors = rowdata$col3,
RowSideLabs = "DE genes integration",
cexRow = 0.9, cexCol = 0.9)
legend(0.8,1,legend=c("OA","Normal"),
fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.9,1,legend=c("UP","DOWN"),
fill=c("red","blue"), bty = "n", title = "DE")

png(file = paste0(MEDIAgraph,"Fig5b_ValHeatmap.png"), width = 15, height = 12.4, res=300, units = "in")
heatmap3(normdata[rowdata$genes,rownames(coldataAnn)],
scale="none",
col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
RowAxisColors = 1,
balanceColor = TRUE,
margins = c(16,10),
ColSideColors = coldataAnn$col2,
ColSideLabs = "Class",
RowSideColors = rowdata$col3,
RowSideLabs = "DE genes integration",
cexRow = .8, cexCol = .9)
legend(0.8,1,legend=c("OA","Normal"),
fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.9,1,legend=c("UP","DOWN"),
fill=c("red","blue"), bty = "n", title = "DE")
dev.off()
Enrichment analysis of consensus signature on Validation Dataset:
GSEA Running Plot
sig <-data.frame(gs_name="Consensus signature",gene_symbol=c(up_genes$gene,dw_genes$gene))
colnames(sig) <-c("gs_name","gene_symbol")
colnames(sig) <-c("term_id","gene_id")
gene <-dff$logFC
names(gene) <-dff$gene
gene <-sort(gene,decreasing = TRUE)
set.seed(2459)
gs <- clusterProfiler::GSEA(gene,TERM2GENE = sig,
minGSSize = 5,
eps = 0,
pAdjustMethod = "fdr",
pvalueCutoff =0.01)
gseaplot2(gs, title ="Running score GSEA plot - validation dataset", geneSetID = 1, pvalue_table = T)
